# download_ets_gdrive_live.py
# ET&S (Educational Technology and Society) Downloader
# Handles Google Drive-hosted PDFs for ET&S issues
# - Supports open?id= and /file/d/ formats
# - Skips the "Table of Content" and "Complete Issue" links; keeps every other Drive link
# - Applies dynamic filename sanitization

import requests
from bs4 import BeautifulSoup
import re
import os

def sanitize(title):
    """Return *title* reduced to text that is safe to use as a file name.

    The characters ``\\ / : * ? " < > |`` and ASCII control characters are
    removed, every run of whitespace (including newlines and tabs embedded
    in link text) is collapsed to a single space, and leading/trailing
    whitespace plus trailing dots are trimmed.

    The result may be an empty string when *title* contains nothing usable;
    callers that need a non-empty name must supply their own fallback.
    """
    # Whitespace control characters (\t, \n, \r, ...) are deliberately left
    # for the split/join below so that words separated only by a line break
    # do not get glued together.
    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x08\x0e-\x1b\x7f]', '', title)
    cleaned = ' '.join(cleaned.split())
    # Windows rejects names that end in a dot or a space.
    return cleaned.rstrip('. ')

issue_url = input("Enter ET&S issue URL: ").strip()

# Browser-like User-Agent sent with every request this script makes.
headers = {"User-Agent": "Mozilla/5.0"}

# Fetch the issue page. The timeout keeps the script from hanging on an
# unresponsive server, and the status check prevents an HTTP error page from
# being scraped as if it were the issue listing.
try:
    resp = requests.get(issue_url, headers=headers, timeout=30)
    resp.raise_for_status()
except requests.RequestException as exc:
    raise SystemExit(f"Could not fetch issue page: {exc}")
soup = BeautifulSoup(resp.text, 'html.parser')

# Name the output folder after the page <title>; fall back to a fixed name
# when the tag is missing or nothing usable is left after sanitizing.
title_tag = soup.find("title")
page_title = title_tag.get_text(strip=True) if title_tag else "ETS_Issue"
issue_name = sanitize(page_title) or "ETS_Issue"
folder = f"ETS_{issue_name}"
os.makedirs(folder, exist_ok=True)

# Google Drive links come in two shapes. Each pattern captures the file ID and
# stops at the first delimiter, so trailing parameters such as "&usp=sharing"
# or a "#fragment" never end up inside the ID, and a "/file/d/<id>" link is
# recognised even when nothing follows the ID.
_OPEN_ID_RE = re.compile(r"[?&]id=([^&#]+)")
_FILE_D_RE = re.compile(r"/file/d/([^/?#]+)")

# Link texts that are skipped (table of contents and the complete issue).
_SKIPPED_TITLES = ("Table of Content", "Complete Issue")

# (title, direct-download URL) pairs, in page order.
articles = []

# Collect every Google Drive link on the page
for a in soup.find_all('a', href=True):
    href = a['href']
    title = a.get_text(strip=True)

    if any(marker in title for marker in _SKIPPED_TITLES):
        continue

    if "drive.google.com/open?id=" in href:
        match = _OPEN_ID_RE.search(href)
    elif "/file/d/" in href:
        match = _FILE_D_RE.search(href)
    else:
        match = None
    file_id = match.group(1) if match else None

    # Links without visible text are dropped: the text becomes the file name.
    if file_id and title:
        direct_url = f"https://drive.google.com/uc?export=download&id={file_id}"
        articles.append((title, direct_url))

# Download PDFs. A failure on one article is reported and skipped so the
# remaining articles are still fetched.
saved = 0
used_names = set()  # lower-cased file names already produced in this run

for title, pdf_url in articles:
    print(f"Downloading: {title}")

    # Build a file name that is non-empty and does not overwrite a file saved
    # earlier in this run (two links can carry the same text).
    base = sanitize(title) or "untitled"
    fname = f"{base}.pdf"
    suffix = 1
    while fname.lower() in used_names:
        suffix += 1
        fname = f"{base} ({suffix}).pdf"
    used_names.add(fname.lower())
    path = os.path.join(folder, fname)

    try:
        # The context manager releases the streamed connection even when the
        # download is abandoned part-way.
        with requests.get(pdf_url, headers=headers, stream=True, timeout=60) as r:
            r.raise_for_status()
            # An HTML response is not a PDF (Drive presumably served an
            # interstitial or error page); don't save it under a .pdf name.
            if "text/html" in r.headers.get("Content-Type", "").lower():
                print(f"Skipped (server returned an HTML page, not a PDF): {title}")
                continue
            with open(path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
    except (requests.RequestException, OSError) as exc:
        print(f"Failed: {title} ({exc})")
        continue

    saved += 1
    print(f"✅ Saved: {fname}")

print(f"\nAll done! {saved} PDFs are in {folder} folder.")
